Load data, prepared fpkm values and ortholog annonation

datasets = as.data.frame(scan("Stanford_datasets.txt",list(setname="",seqBatch="",species="",tissue=""),sep="\t"))
rawCounts <- as.matrix(read.table('Stanford_datasets_rawCountsMat.txt',header=FALSE,sep='\t'))
geneDetails <- as.data.frame(scan("ortholog_GC_table.txt",skip=1,list(mouse_name="",mouse_GC = 0.0,human_name = "",human_GC=0.0)))

Set columns names

colnames(rawCounts) <- datasets$setname
rownames(rawCounts) <- geneDetails$human_name 
rownames(datasets) <- datasets$setname
rownames(geneDetails) <- geneDetails$human_name 

Filter out lower 30% and mitochondrial genes

rowSums = apply(rawCounts,1,function(x) sum(x))
quantile(rowSums,probs = 0.3) # result is 2947.9 
##    30% 
## 2947.9
filter <- apply(rawCounts,1,function(x) sum(x)>2947.9 )
mt <- grep("mt-",geneDetails$mouse_name)
filteredNames <- setdiff(rownames(rawCounts[filter,]),rownames(rawCounts[mt,])) 
filteredRawCounts <- rawCounts[filteredNames,]

Normalize data accounting for GC content

GCnormCounts <- filteredRawCounts
GCnormCounts[,1:13] <- withinLaneNormalization(filteredRawCounts[,1:13],geneDetails[filteredNames,"human_GC"],which="loess",round=TRUE)
GCnormCounts[,14:26] <- withinLaneNormalization(filteredRawCounts[,14:26],geneDetails[filteredNames,"mouse_GC"],which="loess",round=TRUE)

Depth normalize using TMM scaling factors

origColSums <- apply(rawCounts,2,function(x) sum(x))
normFactors <- calcNormFactors(GCnormCounts,method='TMM')
colSums = apply(GCnormCounts,2,function(x) sum(x))
normalizedColSums <- origColSums
i <- 1
while (i<length(colSums)){
  normalizedColSums[i] <- origColSums[i]* normFactors[i]
  i <- i+1
}
meanDepth <- mean(normalizedColSums)
filteredDepthNormCounts <- GCnormCounts
i <- 1
while (i<ncol(filteredDepthNormCounts)){
 filteredDepthNormCounts[,i] <- (GCnormCounts[,i]/normalizedColSums[i])*meanDepth
 i <- i+1
 }

Normalize using log transforming

logTransformedDepthNormCounts <- log2(filteredDepthNormCounts+1)

Apply Combat method to correct batch effect

meta <- data.frame(seqBatch = datasets$seqBatch,tissue=datasets$tissue,species=datasets$species)
design <- model.matrix(~1,data=meta)
combat <- ComBat(dat= logTransformedDepthNormCounts,batch=meta$seqBatch,mod=design,par.prior=TRUE)
## Found 190 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found5batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

Plot correlation heatmaps using different methods

Apply PCA method to corrected data

transposed_combat <- t(combat)
pca_proc <- prcomp(transposed_combat[,apply(transposed_combat, 2, var, na.rm=TRUE) != 0],scale=TRUE,center=TRUE,retX=TRUE)

Check PCA statistics

summary(pca_proc)
## Importance of components:
##                            PC1     PC2      PC3      PC4     PC5      PC6
## Standard deviation     46.1324 34.5088 29.83714 28.62700 26.4953 25.58011
## Proportion of Variance  0.2064  0.1155  0.08636  0.07949  0.0681  0.06347
## Cumulative Proportion   0.2064  0.3220  0.40831  0.48781  0.5559  0.61938
##                             PC7      PC8      PC9     PC10     PC11     PC12
## Standard deviation     21.40856 20.27110 18.77193 18.45646 16.76072 16.33343
## Proportion of Variance  0.04446  0.03986  0.03418  0.03304  0.02725  0.02588
## Cumulative Proportion   0.66384  0.70370  0.73788  0.77092  0.79817  0.82405
##                            PC13     PC14     PC15     PC16     PC17     PC18
## Standard deviation     15.49347 15.16600 14.82063 13.83479 12.83094 12.59261
## Proportion of Variance  0.02329  0.02231  0.02131  0.01857  0.01597  0.01538
## Cumulative Proportion   0.84734  0.86965  0.89095  0.90952  0.92549  0.94087
##                            PC19     PC20     PC21     PC22    PC23    PC24
## Standard deviation     11.63296 11.16385 10.52465 10.46957 8.10270 6.59013
## Proportion of Variance  0.01313  0.01209  0.01074  0.01063 0.00637 0.00421
## Cumulative Proportion   0.95400  0.96609  0.97683  0.98747 0.99383 0.99805
##                           PC25      PC26
## Standard deviation     4.48659 8.597e-14
## Proportion of Variance 0.00195 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00

Transfer PCA data to plots

plotData = datasets[,c("setname","species","tissue")]
plotData$PC1 <- pca_proc$x[,1]
plotData$PC2 <- pca_proc$x[,2]
plotData$PC3 <- pca_proc$x[,3]

Plot the first and the second principal components

Plot the first and the second principal components with centroids

Plot the first, the second and the third principal components

Test for significance of correlations between the matched tissues PC values of human and mouse

cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="pearson") 
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC1[1:13] and plotData$PC1[14:26]
## t = 1.0409, df = 11, p-value = 0.3202
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3012326  0.7299944
## sample estimates:
##       cor 
## 0.2994546
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC2[1:13] and plotData$PC2[14:26]
## t = 5.4008, df = 11, p-value = 0.0002164
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5677182 0.9548235
## sample estimates:
##       cor 
## 0.8521479
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="pearson") 
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC3[1:13] and plotData$PC3[14:26]
## t = 3.0563, df = 11, p-value = 0.01092
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2021453 0.8946116
## sample estimates:
##       cor 
## 0.6776541